# Record the R version, platform and loaded packages so the results below can
# be tied to a specific environment. Always print this out before your
# assignment.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods  
## [7] base     
## 
## other attached packages:
## [1] knitr_1.36
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.28   R6_2.5.1        jsonlite_1.7.2  magrittr_2.0.1 
##  [5] evaluate_0.14   rlang_0.4.12    stringi_1.7.5   jquerylib_0.1.4
##  [9] bslib_0.3.1     rmarkdown_2.11  tools_4.1.2     stringr_1.4.0  
## [13] xfun_0.28       yaml_2.2.1      fastmap_1.1.0   compiler_4.1.2 
## [17] htmltools_0.5.2 sass_0.4.0
# Print the current working directory for reference.
getwd()
## [1] "C:/Users/Daniel/Documents/GitHub/MGSC-310-Project"

library("here")
library("tidyverse")
library("forcats")
library("rsample")
library("ggplot2")
library("ggmap")
library("dplyr")
library("lubridate")
library("xgboost")
library('DiagrammeR')
library('Matrix')

# Load the accidents extract from the project's datasets/ folder, then print
# per-column summary statistics for a first look at types and missing values.
crashes <- read.csv(file = here("datasets", "US_Accidents_Dec20_updated.csv"))

summary(object = crashes)
##       ID               Severity      Start_Time       
##  Length:1516064     Min.   :1.000   Length:1516064    
##  Class :character   1st Qu.:2.000   Class :character  
##  Mode  :character   Median :2.000   Mode  :character  
##                     Mean   :2.239                     
##                     3rd Qu.:2.000                     
##                     Max.   :4.000                     
##                                                       
##    End_Time           Start_Lat       Start_Lng          End_Lat     
##  Length:1516064     Min.   :24.57   Min.   :-124.50   Min.   :24.57  
##  Class :character   1st Qu.:33.85   1st Qu.:-118.21   1st Qu.:33.85  
##  Mode  :character   Median :37.35   Median : -94.38   Median :37.35  
##                     Mean   :36.90   Mean   : -98.60   Mean   :36.90  
##                     3rd Qu.:40.73   3rd Qu.: -80.87   3rd Qu.:40.73  
##                     Max.   :49.00   Max.   : -67.11   Max.   :49.08  
##                                                                      
##     End_Lng         Distance.mi.      Description       
##  Min.   :-124.50   Min.   :  0.0000   Length:1516064    
##  1st Qu.:-118.21   1st Qu.:  0.0000   Class :character  
##  Median : -94.38   Median :  0.1780   Mode  :character  
##  Mean   : -98.60   Mean   :  0.5873                     
##  3rd Qu.: -80.87   3rd Qu.:  0.5940                     
##  Max.   : -67.11   Max.   :155.1860                     
##                                                         
##      Number           Street              Side          
##  Min.   :      0   Length:1516064     Length:1516064    
##  1st Qu.:   1212   Class :character   Class :character  
##  Median :   4000   Mode  :character   Mode  :character  
##  Mean   :   8908                                        
##  3rd Qu.:  10100                                        
##  Max.   :9999997                                        
##  NA's   :1046095                                        
##      City              County             State          
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    Zipcode            Country            Timezone        
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Airport_Code       Weather_Timestamp  Temperature.F.  
##  Length:1516064     Length:1516064     Min.   :-89.00  
##  Class :character   Class :character   1st Qu.: 47.00  
##  Mode  :character   Mode  :character   Median : 61.00  
##                                        Mean   : 59.58  
##                                        3rd Qu.: 73.00  
##                                        Max.   :170.60  
##                                        NA's   :43033   
##  Wind_Chill.F.     Humidity...      Pressure.in.   Visibility.mi.  
##  Min.   :-89.0    Min.   :  1.00   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.: 40.8    1st Qu.: 48.00   1st Qu.:29.44   1st Qu.: 10.00  
##  Median : 57.0    Median : 68.00   Median :29.88   Median : 10.00  
##  Mean   : 55.1    Mean   : 64.66   Mean   :29.55   Mean   :  9.13  
##  3rd Qu.: 71.0    3rd Qu.: 84.00   3rd Qu.:30.04   3rd Qu.: 10.00  
##  Max.   :113.0    Max.   :100.00   Max.   :58.04   Max.   :140.00  
##  NA's   :449316   NA's   :45509    NA's   :36274   NA's   :44211   
##  Wind_Direction     Wind_Speed.mph.  Precipitation.in.
##  Length:1516064     Min.   :  0.00   Min.   : 0       
##  Class :character   1st Qu.:  4.60   1st Qu.: 0       
##  Mode  :character   Median :  7.00   Median : 0       
##                     Mean   :  7.63   Mean   : 0       
##                     3rd Qu.: 10.40   3rd Qu.: 0       
##                     Max.   :984.00   Max.   :24       
##                     NA's   :128862   NA's   :510549   
##  Weather_Condition    Amenity              Bump          
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    Crossing           Give_Way           Junction        
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    No_Exit            Railway           Roundabout       
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    Station              Stop           Traffic_Calming   
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Traffic_Signal     Turning_Loop       Sunrise_Sunset    
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Civil_Twilight     Nautical_Twilight  Astronomical_Twilight
##  Length:1516064     Length:1516064     Length:1516064       
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
## 


# Add analysis-ready copies of selected raw columns: timestamps are parsed
# with lubridate and categorical text fields are converted to factors. The
# raw columns are kept as-is; each derived column carries a `_Clean` suffix.
crashes_clean <- mutate(
  crashes,
  Start_Time_Clean = ymd_hms(Start_Time),
  End_Time_Clean = ymd_hms(End_Time),
  Weather_Condition_Clean = as.factor(Weather_Condition),
  Wind_Direction_Clean = as.factor(Wind_Direction),
  Weather_Timestamp_Clean = ymd_hms(Weather_Timestamp),
  State_Clean = as.factor(State),
  County_Clean = as.factor(County),
  City_Clean = as.factor(City),
  Side_Clean = as.factor(Side)
)

# NOTE(review): the recorded summary lists both "CALM" and "Calm" as separate
# Wind_Direction_Clean levels -- consider normalising case before factoring.
summary(object = crashes_clean)
##       ID               Severity      Start_Time       
##  Length:1516064     Min.   :1.000   Length:1516064    
##  Class :character   1st Qu.:2.000   Class :character  
##  Mode  :character   Median :2.000   Mode  :character  
##                     Mean   :2.239                     
##                     3rd Qu.:2.000                     
##                     Max.   :4.000                     
##                                                       
##    End_Time           Start_Lat       Start_Lng          End_Lat     
##  Length:1516064     Min.   :24.57   Min.   :-124.50   Min.   :24.57  
##  Class :character   1st Qu.:33.85   1st Qu.:-118.21   1st Qu.:33.85  
##  Mode  :character   Median :37.35   Median : -94.38   Median :37.35  
##                     Mean   :36.90   Mean   : -98.60   Mean   :36.90  
##                     3rd Qu.:40.73   3rd Qu.: -80.87   3rd Qu.:40.73  
##                     Max.   :49.00   Max.   : -67.11   Max.   :49.08  
##                                                                      
##     End_Lng         Distance.mi.      Description       
##  Min.   :-124.50   Min.   :  0.0000   Length:1516064    
##  1st Qu.:-118.21   1st Qu.:  0.0000   Class :character  
##  Median : -94.38   Median :  0.1780   Mode  :character  
##  Mean   : -98.60   Mean   :  0.5873                     
##  3rd Qu.: -80.87   3rd Qu.:  0.5940                     
##  Max.   : -67.11   Max.   :155.1860                     
##                                                         
##      Number           Street              Side          
##  Min.   :      0   Length:1516064     Length:1516064    
##  1st Qu.:   1212   Class :character   Class :character  
##  Median :   4000   Mode  :character   Mode  :character  
##  Mean   :   8908                                        
##  3rd Qu.:  10100                                        
##  Max.   :9999997                                        
##  NA's   :1046095                                        
##      City              County             State          
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    Zipcode            Country            Timezone        
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Airport_Code       Weather_Timestamp  Temperature.F.  
##  Length:1516064     Length:1516064     Min.   :-89.00  
##  Class :character   Class :character   1st Qu.: 47.00  
##  Mode  :character   Mode  :character   Median : 61.00  
##                                        Mean   : 59.58  
##                                        3rd Qu.: 73.00  
##                                        Max.   :170.60  
##                                        NA's   :43033   
##  Wind_Chill.F.     Humidity...      Pressure.in.   Visibility.mi.  
##  Min.   :-89.0    Min.   :  1.00   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.: 40.8    1st Qu.: 48.00   1st Qu.:29.44   1st Qu.: 10.00  
##  Median : 57.0    Median : 68.00   Median :29.88   Median : 10.00  
##  Mean   : 55.1    Mean   : 64.66   Mean   :29.55   Mean   :  9.13  
##  3rd Qu.: 71.0    3rd Qu.: 84.00   3rd Qu.:30.04   3rd Qu.: 10.00  
##  Max.   :113.0    Max.   :100.00   Max.   :58.04   Max.   :140.00  
##  NA's   :449316   NA's   :45509    NA's   :36274   NA's   :44211   
##  Wind_Direction     Wind_Speed.mph.  Precipitation.in.
##  Length:1516064     Min.   :  0.00   Min.   : 0       
##  Class :character   1st Qu.:  4.60   1st Qu.: 0       
##  Mode  :character   Median :  7.00   Median : 0       
##                     Mean   :  7.63   Mean   : 0       
##                     3rd Qu.: 10.40   3rd Qu.: 0       
##                     Max.   :984.00   Max.   :24       
##                     NA's   :128862   NA's   :510549   
##  Weather_Condition    Amenity              Bump          
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    Crossing           Give_Way           Junction        
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    No_Exit            Railway           Roundabout       
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    Station              Stop           Traffic_Calming   
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Traffic_Signal     Turning_Loop       Sunrise_Sunset    
##  Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Civil_Twilight     Nautical_Twilight  Astronomical_Twilight
##  Length:1516064     Length:1516064     Length:1516064       
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
##                                                             
##  Start_Time_Clean              End_Time_Clean               
##  Min.   :2016-02-08 00:37:08   Min.   :2016-02-08 06:37:08  
##  1st Qu.:2018-07-17 14:41:25   1st Qu.:2018-07-17 17:13:14  
##  Median :2020-01-24 11:16:33   Median :2020-01-24 13:38:15  
##  Mean   :2019-07-15 07:01:48   Mean   :2019-07-15 11:42:20  
##  3rd Qu.:2020-10-22 13:01:30   3rd Qu.:2020-10-22 17:50:19  
##  Max.   :2020-12-31 23:28:56   Max.   :2021-01-01 00:00:00  
##                                                             
##   Weather_Condition_Clean Wind_Direction_Clean
##  Fair         :465252     CALM   :202870      
##  Mostly Cloudy:193595     Calm   : 79192      
##  Clear        :180223     WNW    : 77743      
##  Cloudy       :161291     NW     : 75810      
##  Partly Cloudy:133102     W      : 72059      
##  Overcast     : 87853     SSW    : 69901      
##  (Other)      :294748     (Other):938489      
##  Weather_Timestamp_Clean        State_Clean    
##  Min.   :2016-02-08 00:53:00   CA     :448833  
##  1st Qu.:2018-07-10 10:55:30   FL     :153007  
##  Median :2020-01-22 05:53:00   OR     : 87484  
##  Mean   :2019-07-12 00:02:11   TX     : 75142  
##  3rd Qu.:2020-10-21 04:54:00   NY     : 60974  
##  Max.   :2020-12-31 23:35:00   MN     : 52345  
##  NA's   :30264                 (Other):638279  
##          County_Clean           City_Clean      Side_Clean 
##  Los Angeles   : 138819   Los Angeles:  39984   L: 221502  
##  Orange        :  49833   Miami      :  36233   R:1294562  
##  Miami-Dade    :  47382   Charlotte  :  22203              
##  San Bernardino:  30251   Houston    :  20843              
##  San Diego     :  26623   Dallas     :  19497              
##  Sacramento    :  25941   Sacramento :  18431              
##  (Other)       :1197215   (Other)    :1358873

# Map of accident start coordinates, coloured by severity level.
qmplot(
  x = Start_Lng,
  y = Start_Lat,
  data = crashes,
  maptype = "toner-lite",
  color = factor(Severity)
)


# Same map drawn from the accident end coordinates.
qmplot(
  x = End_Lng,
  y = End_Lat,
  data = crashes,
  maptype = "toner-lite",
  color = factor(Severity)
)


# Distribution of the severity score.
crashes_clean %>%
  ggplot(aes(x = Severity)) +
  geom_histogram()


# Affected road distance (miles) plotted against the accident start time.
crashes_clean %>%
  ggplot(aes(x = Start_Time_Clean, y = Distance.mi.)) +
  geom_line()


# Accident counts over time. The x axis is a date-time, so the bin width is
# given in seconds: 86400 s = one day per bin.
ggplot(data = crashes_clean, mapping = aes(x = Start_Time_Clean)) +
  geom_histogram(binwidth = 86400)


# Number of accidents per day of the week. The weekday is taken from the
# already-parsed Start_Time_Clean column instead of the raw character
# Start_Time, so the timestamps are not implicitly re-parsed and the plot is
# consistent with the other time-based plots in this script.
crashes_clean %>%
  mutate(wday = wday(Start_Time_Clean, label = TRUE)) %>%
  ggplot(aes(x = wday)) +
  geom_bar()


# Hold out 25% of the cleaned data for evaluation. initial_split() samples
# rows at random, so the RNG is seeded first to make the partition (and
# anything computed from it) reproducible between runs.
set.seed(1)
crashes_split <- initial_split(crashes_clean, prop = 0.75)

crashes_train <- training(crashes_split)
crashes_test <- testing(crashes_split)

# ---- XGBoost severity classifier ----
#
# NOTE: console output recorded from an earlier run has been removed from this
# section because the model configuration and the evaluation changed; re-knit
# to regenerate the training log and the test error.

# Modelling table: remove the columns excluded from the model and keep only
# complete rows. Character columns are then converted to factors on the FULL
# table so that their levels are fixed before splitting; the design matrices
# built from the train and test subsets therefore get identical columns, even
# when a level occurs in only one of the two subsets.
crashes_xgb <- crashes %>%
  select(-ID,
         -Description,
         -Street,
         -Weather_Timestamp,
         -Number,
         -Airport_Code,
         -Country,
         -Turning_Loop) %>%
  drop_na() %>%
  mutate(across(where(is.character), as.factor))

# Seed the RNG so the random 75/25 partition is reproducible.
set.seed(1)
crashes_split_xgb <- initial_split(crashes_xgb, prop = 0.75)

crashes_train_xbg <- training(crashes_split_xgb)
crashes_test_xbg <- testing(crashes_split_xgb)

# One-hot encode the predictors (no intercept). Unused factor levels are kept
# by default, which is what keeps the train and test columns aligned.
sparse_matrix_train <- sparse.model.matrix(Severity ~ . - 1, data = crashes_train_xbg)
sparse_matrix_test <- sparse.model.matrix(Severity ~ . - 1, data = crashes_test_xbg)
stopifnot(identical(colnames(sparse_matrix_train), colnames(sparse_matrix_test)))

# xgboost expects multiclass labels to be integers starting at 0.
y_train <- as.integer(crashes_train_xbg$Severity) - 1
y_test <- as.integer(crashes_test_xbg$Severity) - 1

# Number of classes implied by the zero-based labels (Severity ranges over
# 1-4 in the summary above, i.e. 4 classes), instead of a hard-coded value.
n_severity_classes <- max(y_train, y_test) + 1

xgb_train <- xgb.DMatrix(data = sparse_matrix_train, label = y_train)
xgb_test <- xgb.DMatrix(data = sparse_matrix_test, label = y_test)

# Row/column subsampling is random; seed R's RNG as well as passing `seed`
# so the fit is reproducible regardless of which one the installed xgboost
# version honours.
set.seed(1)
xgb <- xgboost(
  data = xgb_train,
  eta = 0.1,
  max_depth = 15,
  nrounds = 15,
  subsample = 0.5,
  colsample_bytree = 0.5,
  seed = 1,
  eval_metric = "merror",
  objective = "multi:softprob",
  num_class = n_severity_classes,
  nthread = 3
)

# Class probabilities for the held-out rows: one row per observation and one
# column per class (column k holds the probability of label k - 1).
y_pred <- predict(xgb, newdata = xgb_test, reshape = TRUE)
print(dim(y_pred))
print(head(y_pred))

# Predicted label = most probable class, shifted back to the zero-based
# labels; the test error is the share of held-out rows that are misclassified.
y_pred_class <- max.col(y_pred, ties.method = "first") - 1
err <- mean(y_pred_class != y_test)
print(paste("test-error=", err))


# Draw a single tree from the fitted booster and display it.
# NOTE(review): `feature.keep` does not look like a documented argument of
# xgb.plot.tree() -- confirm it is not being silently absorbed by `...`.
tree_plot <- xgb.plot.tree(
  model = xgb,
  trees = 1,
  feature.keep = 3
)

tree_plot

# Feature importance of the fitted model: plotted first, then printed as a
# table.
importance_matrix <- xgb.importance(model = xgb)
xgb.plot.importance(importance_matrix = importance_matrix)



importance_matrix